Citi Bike Tutorial

The Data: combination of historical usage patterns with weather data in order to forecast bike rental demand in the Capital Bikeshare program in Washington, D.C.

You are provided hourly rental data spanning two years. The data set is comprised of the first 19 days of each month.

Before we begin predicting anything we will analyse the data to gain a better understanding and try to identify some of the key trands and possible decision variables.

In [ ]:
#Installing Packages
!pip install pandas
In [2]:
import pandas as pd
import numpy as np
import datetime

import matplotlib.pyplot as plt
%matplotlib inline

import os
os.getcwd()
#https://plot.ly/python/offline/
Out[2]:
'C:\\Users\\marcus.ohanlon\\Notebooks'
In [3]:
import plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import cufflinks as cf
from plotly.graph_objs import *
init_notebook_mode()
import datetime as dt
cf.go_offline()
cf.set_config_file(theme='white')

Read in the data and create some date related columns

In [4]:
df = pd.read_csv('Bike.csv', parse_dates={'datetime'}, index_col='datetime')


df['month'] = df.index.month
#df['weekday'] = df.index.weekday
df['hour'] = df.index.hour
df['day'] = df.index.dayofweek
df['date'] = df.index.date
df['time'] = df.index.time

days = {0:'A:Mon',1:'B:Tues',2:'C:Weds',3:'D:Thurs',4:'E:Fri',5:'F:Sat',6:'G:Sun'}

df['day'] = df['day'].apply(lambda x: days[x])
##df['timefuzz'] =dt.datetime.combine(df.index.time)+datetime.timedelta(0,3).time()
df.head(1)
Out[4]:
season holiday workingday weather temp atemp humidity windspeed casual registered count month hour day date time
datetime
2011-01-01 1 0 0 1 9.84 14.395 81 0 3 13 16 1 0 F:Sat 2011-01-01 00:00:00
In [5]:
s = {1:'Spring',2:'Summer',3:'Autumn',4:'Winter'}
df['seasonName'] = df['season'].apply(lambda x: s[x])
df.head(1)
Out[5]:
season holiday workingday weather temp atemp humidity windspeed casual registered count month hour day date time seasonName
datetime
2011-01-01 1 0 0 1 9.84 14.395 81 0 3 13 16 1 0 F:Sat 2011-01-01 00:00:00 Spring

1.Summary Statistics

In [255]:
print df['season'].count()
print df['season'].min()
print df['season'].max()
print np.average(df['season'])
10886
1
4
2.50661399963
In [268]:
print df['date'].max() - df['date'].min()
718 days, 0:00:00
In [269]:
df['count'].sum()
Out[269]:
2085476L
In [271]:
df['count'].sum()/((df['date'].max() - df['date'].min()).days)
Out[271]:
2904L
In [6]:
df['season'].isnull().sum()
Out[6]:
0
In [7]:
df.isnull().sum()
Out[7]:
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
month         0
hour          0
day           0
date          0
time          0
seasonName    0
dtype: int64

Filtering

In [278]:
dffilt = df[(df['season']==1) & (df['count']>10)]
print dffilt['season'].max()
1
In [8]:
dffilt = df[(df['season']==1) | (df['season']==3)]
print dffilt['season'].max()
3

3.GroupBy

In [288]:
g1 = df[['count','registered','casual','season']].groupby(['season']).sum().sort_values('count',ascending=False)
g1.head()
Out[288]:
count registered casual
season
3 640662 497944 142718
2 588282 458610 129672
4 544034 465894 78140
1 312498 270893 41605
In [294]:
view = df[['casual','registered','holiday']].groupby('holiday').sum()
view
Out[294]:
casual registered
holiday
0 376964 1650704
1 15171 42637

4.Simple Plots

In [299]:
view.plot(kind='bar')
plt.title("bar")
Out[299]:
<matplotlib.text.Text at 0x2a586240>
In [306]:
df['casual'][:50].plot(figsize=(13,3))
df['registered'][:50].plot()
plt.legend()
plt.title('Chart')
Out[306]:
<matplotlib.text.Text at 0x2b0a87f0>

5.Pivots

In [16]:
pivot1 = df.pivot_table(index='hour',columns = 'day',values=['count'],aggfunc=np.sum,margins=True)
pivot1.head(5)
Out[16]:
count
day A:Mon B:Tues C:Weds D:Thurs E:Fri F:Sat G:Sun All
hour
0 2307 1749 2356 2436 3407 6482 6351 25088
1 1175 750 1015 1002 1565 4621 5244 15372
2 672 397 527 539 802 3323 3999 10259
3 335 234 305 297 392 1522 2006 5091
4 393 325 287 334 350 506 637 2832
In [17]:
aggFunc = {'casual': sum,'registered': np.std}
pivot2 = df.pivot_table(index='hour',columns = 'day',values=['casual','registered'],aggfunc=aggFunc,margins=True)
pivot2.head(5)
Out[17]:
registered casual
day A:Mon B:Tues C:Weds D:Thurs E:Fri F:Sat G:Sun All A:Mon B:Tues C:Weds D:Thurs E:Fri F:Sat G:Sun All
hour
0 17.398124 11.165378 31.779075 16.989674 19.411148 41.301382 32.472502 35.255433 672 371 375 406 621 1094 1153 4692
1 13.623544 5.376077 15.342829 6.635532 10.236102 25.503841 27.062773 27.764728 340 185 174 194 309 750 1005 2957
2 6.094662 3.024864 11.432318 3.454889 4.978301 17.352354 19.533542 21.083148 232 119 105 122 181 608 792 2159
3 3.545727 2.022260 3.332322 2.325905 2.821876 8.817087 10.555702 9.484696 84 40 44 48 78 345 522 1161
4 2.930241 2.337302 2.637093 2.913587 3.400502 2.731539 4.312529 3.207394 91 42 32 41 41 133 178 558

6.Concatinating + Merging

In [321]:
left = df[df['season']==1]
right = df[df['season']==2]

result = pd.concat([left,right])

result['season'].max()
Out[321]:
2
In [9]:
left = df.drop(['count', 'registered', 'casual','date','time'],1)
right = df.drop(['season','holiday','workingday','weather','temp',
                 'atemp','humidity','windspeed','month','hour','day'],1)

result = pd.merge(left,right, how='inner',left_index=True,right_index=True) 
result.head(1)
Out[9]:
season holiday workingday weather temp atemp humidity windspeed month hour day seasonName_x casual registered count date time seasonName_y
datetime
2011-01-01 1 0 0 1 9.84 14.395 81 0 1 0 F:Sat Spring 3 13 16 2011-01-01 00:00:00 Spring
In [10]:
left = left.reset_index()
right = right.reset_index()

pd.merge(left,right, on='datetime', how='left').head(1)
Out[10]:
datetime season holiday workingday weather temp atemp humidity windspeed month hour day seasonName_x casual registered count date time seasonName_y
0 2011-01-01 1 0 0 1 9.84 14.395 81 0 1 0 F:Sat Spring 3 13 16 2011-01-01 00:00:00 Spring

More Summary Stats

In [8]:
#Basic counting Procedures
print df.season.count()
print df['season'].count()
print df['season'].sum()
print df['season'].min()
print np.average(df.season)

print df.date.max()
print df.date.min()
print df.date.max() - df.date.min()
print df['count'].sum()/(df.date.max() - df.date.min()).days

df.isnull()
df[1].fillna(0, inplace=True)
1
4
2.50661399963
27287
10886
2686

More Filtering & Plots

In [135]:
#Filtering and Counting
dfbit = df[(df['season'] == 1) | (df['season'] == 2)]
dfbit = df[(df['season'] == 1) & (df['season'] == 2)]

dfbit = df.query('season>1').query('season<4')

df.query('season in (1,2,3)')


g1 = df[['count','season']].groupby(['season']).sum().sort_values('count',ascending = False)
g1


view = df[['casual','registered']].groupby(df.holiday).sum()
view
Out[135]:
casual registered
holiday
0 376964 1650704
1 15171 42637
In [31]:
view.plot(kind='bar')
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x20b28898>
In [ ]:
df['casual'][:50].plot(figsize=(13,3),title = 'Graph')
df['registered'][:50].plot()
plt.legend()
In [231]:
x = df['count']
y = df['temp']
plt.scatter(x,y)
plt.xlim((0,1500))
Out[231]:
(0, 1500)

More Pivots

In [219]:
df.pivot_table(index= 'hour',columns='day',values=['registered'], margins=True,aggfunc = np.sum)

aggFunc = {'casual': sum,
           'registered': np.average
           }
pd.pivot_table(df,index='holiday',aggfunc=aggFunc,values=['casual','registered'], margins=True)
Out[219]:
casual registered
holiday
0 376964 156.094941
1 15171 137.096463
All 392135 155.552177

Concatingating + Joining

In [224]:
#concatinating
left = df[df['season'] == 1]
right = df[df['season'] == 2]
print left.season.max()


result = pd.concat([left,right])

print result.season.max()
1
2
In [122]:
#joining dataframes
left = df.drop(['count', 'registered', 'casual','date','time'],1)
right = df.drop(['season','holiday','workingday','weather','temp',
                 'atemp','humidity','windspeed','month','hour','day'],1)

result = pd.merge(left,right, how='inner',left_index=True,right_index=True) 
result.head(1)
Out[122]:
season holiday workingday weather temp atemp humidity windspeed month hour day casual registered count date time
datetime
2011-01-01 1 0 0 1 9.84 14.395 81 0 1 0 F:Sat 3 13 16 2011-01-01 00:00:00
In [44]:
left = left.reset_index()
right = right.reset_index()

#result = left.join(right)

pd.merge(left,right, on='datetime', how='left').head(1)
Out[44]:
datetime season holiday workingday weather temp atemp humidity windspeed month hour day casual registered count date time
0 2011-01-01 1 0 0 1 9.84 14.395 81 0 1 0 F:Sat 3 13 16 2011-01-01 00:00:00

Traditional SQL

In [ ]:
!pip install pandasql
In [51]:
import pandasql as ps
In [325]:
q1 = """SELECT count(registered) as 'COUNT' FROM df Where season =1 """
 
df1 = ps.sqldf(q1, locals())
df1
Out[325]:
COUNT
0 2686

Some Visualizations

In [4]:
hour = df[['casual', 'registered']].groupby(df.index.hour).sum()
fig1 = hour.iplot(kind='line',subplots = True,shape=(2,1)
                  ,layout=Layout(paper_bgcolor='rgba(0,0,0,0)',title='Cyclist per Hour'),asFigure=True)
fig1['layout']['yaxis1'].update({'title':'Count'})
fig1['layout']['xaxis2'].update({'title':'Hour'})
fig1['layout'].update({'height':400,'width':900})
iplot(fig1)
In [5]:
day_hour_events = df[['day','hour','count']].groupby(['hour','day']).sum().reset_index()
t = day_hour_events.pivot(index='hour', columns='day', values='count').fillna(method='ffill')


fig2 = t.iplot(kind='line'
                  ,layout=Layout(paper_bgcolor='rgba(0,0,0,0)',title='Cyclist per Hour by Day of Week'),asFigure=True)
fig2['layout']['yaxis'].update({'title':'Count'})
fig2['layout']['xaxis'].update({'title':'Hour'})
fig2['layout'].update({'height':400,'width':900})
iplot(fig2)
In [6]:
day = df[['date', 'count']].groupby('date').sum()

fig3 = day.iplot(kind='line'
                  ,layout=Layout(paper_bgcolor='rgba(0,0,0,0)',title='Bike Usage'),asFigure=True)
fig3['layout']['yaxis'].update({'title':'Count'})
fig3['layout']['xaxis'].update({'title':'Date'})
fig3['layout'].update({'height':400,'width':900})
iplot(fig3)
In [7]:
#keep_cols=['time','day','count','casual','registered']
counts = df[['time','day','count']].groupby(['time','day']).sum().reset_index()
allUsers = counts.set_index([ "time", "day"]).unstack("time")  #turns time rows into columns
allUsers.columns = allUsers.columns.droplevel()

iplot(allUsers.iplot(kind='heatmap', colorscale='reds' ,xTitle = "Day of Week",yTitle ="Hour"                    
                     , title="All Bike Rentals",asFigure=True))
In [8]:
countsCas = df[['time','day','casual']].groupby(['time','day']).sum().reset_index()
casUsers = countsCas.set_index([ "time", "day"]).unstack("time")  #turns time rows into columns
casUsers.columns = casUsers.columns.droplevel()

iplot(casUsers.iplot(kind='heatmap', colorscale='blues' ,xTitle = "Day of Week",yTitle ="Hour"                    
                     , title="Casual Bike Rentals",asFigure=True))
In [10]:
countsReg = df[['time','day','registered']].groupby(['time','day']).sum().reset_index()
regUsers = countsReg.set_index([ "time", "day"]).unstack("time")  #turns time rows into columns
regUsers.columns = regUsers.columns.droplevel()

iplot(regUsers.iplot(kind='heatmap', colorscale='greens' ,xTitle = "Day of Week",yTitle ="Hour"                    
                     , title="Registered Bike Rentals",asFigure=True))

Temperature Corelation

In [11]:
daily_grouped = df[['temp', 'count']].groupby(df.index.date).mean().reset_index()

fig4 = daily_grouped.iplot(kind='scatter',mode='markers',x='temp',y='count'
                  ,layout=Layout(paper_bgcolor='rgba(0,0,0,0)',title='Bike Usage by Temperature'),asFigure=True)
fig4['layout']['yaxis'].update({'title':'Count'})
fig4['layout']['xaxis'].update({'title':'Temp'})
fig4['layout'].update({'height':400,'width':900})
iplot(fig4)

Seasonal Effects

In [12]:
seasons = df[['season','casual','registered','hour']].groupby(['hour','season']).sum().reset_index()

seas = {1:'Spring',2:'Summer',3:'Autumn',4:'Winter'}
seasons['season'] = seasons['season'].apply(lambda x: seas[x])

sc = seasons.pivot(index='hour', columns='season', values='casual').fillna(method='ffill')
sr = seasons.pivot(index='hour', columns='season', values='registered').fillna(method='ffill')

fig5 = sr.iplot(kind='line'
                  ,layout=Layout(paper_bgcolor='rgba(0,0,0,0)',title='Registered Cyclists by Season'),asFigure=True)
fig5['layout']['yaxis'].update({'title':'Count'})
fig5['layout']['xaxis'].update({'title':'Hour'})
fig5['layout'].update({'height':300,'width':900})
iplot(fig5)
In [13]:
fig6 = sc.iplot(kind='line'
                  ,layout=Layout(paper_bgcolor='rgba(0,0,0,0)',title='Casual Cyclists by Season'),asFigure=True)
fig6['layout']['yaxis'].update({'title':'Count'})
fig6['layout']['xaxis'].update({'title':'Hour'})
fig6['layout'].update({'height':300,'width':900})
iplot(fig6)

There is an endless amount of descriptive analysis we could continue to do:

  • Workingday variable
  • Holiday variable
  • Windspeed variable

Modelling

Based on some of the observations above, we know that the behaviour of cyclists who use bikes casually vs. those who are registered is different. Therefore, we're going to build separate models, one for each of the 'casual' and 'registered' users.

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import auc, precision_recall_curve, roc_curve, accuracy_score


def rmsele(actual, pred):
    """
    Given a column of predictions and a column of actuals, calculate the RMSELE
    """
    squared_errors = (np.log(pred + 1) - np.log(actual + 1)) ** 2
    mean_squared = np.sum(squared_errors) / len(squared_errors)
    return np.sqrt(mean_squared)

Prepare the data for modelling

In [11]:
df['day'] = df.index.dayofweek
df['year'] = df.index.year



df1 = df.drop(['count', 'registered', 'casual','date','time'], 1)
df1.reset_index(inplace=True)
df1 = df1.drop('datetime', 1)

X = df1
y = df['registered']
features = X.columns

# Split the data into test and training groups
X_train, X_test, y_train, y_test = train_test_split(X, y)
print ('Data Set Cleaned and ready for Training')
df1.head()
Data Set Cleaned and ready for Training
Out[11]:
season holiday workingday weather temp atemp humidity windspeed month hour day year
0 1 0 0 1 9.84 14.395 81 0 1 0 5 2011
1 1 0 0 1 9.02 13.635 80 0 1 1 5 2011
2 1 0 0 1 9.02 13.635 80 0 1 2 5 2011
3 1 0 0 1 9.84 14.395 75 0 1 3 5 2011
4 1 0 0 1 9.84 14.395 75 0 1 4 5 2011

Use Grid Search to optimise Random Forest parameters

In [25]:
rmsele_scorer = make_scorer(rmsele, greater_is_better=False)

# Optimisation of parameters
parameter_space = [{'max_features': ['sqrt', 'log2', 'auto'], 'max_depth':[5,8,12], 'min_samples_leaf':[2,5,10]}]
rf = GridSearchCV(RandomForestRegressor(n_jobs=1, n_estimators=1000),parameter_space, cv=3, verbose=2, 
                  scoring=rmsele_scorer).fit(X_train, y_train)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=2 -   3.2s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=2 -   3.0s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=2 -   3.5s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=5 -   3.0s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=5 -   3.0s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=5 -   3.4s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=sqrt, max_depth=5, min_samples_leaf=10 -   3.1s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=sqrt, max_depth=5, min_samples_leaf=10 -   3.0s
[CV] max_features=sqrt, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=sqrt, max_depth=5, min_samples_leaf=10 -   3.1s
[CV] max_features=log2, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=log2, max_depth=5, min_samples_leaf=2 -   3.1s
[CV] max_features=log2, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=log2, max_depth=5, min_samples_leaf=2 -   3.1s
[CV] max_features=log2, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=log2, max_depth=5, min_samples_leaf=2 -   3.0s
[CV] max_features=log2, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=log2, max_depth=5, min_samples_leaf=5 -   3.2s
[CV] max_features=log2, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=log2, max_depth=5, min_samples_leaf=5 -   3.2s
[CV] max_features=log2, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=log2, max_depth=5, min_samples_leaf=5 -   3.2s
[CV] max_features=log2, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=log2, max_depth=5, min_samples_leaf=10 -   3.3s
[CV] max_features=log2, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=log2, max_depth=5, min_samples_leaf=10 -   3.2s
[CV] max_features=log2, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=log2, max_depth=5, min_samples_leaf=10 -   3.3s
[CV] max_features=auto, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=auto, max_depth=5, min_samples_leaf=2 -   6.4s
[CV] max_features=auto, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=auto, max_depth=5, min_samples_leaf=2 -   6.4s
[CV] max_features=auto, max_depth=5, min_samples_leaf=2 ..............
[CV] ..... max_features=auto, max_depth=5, min_samples_leaf=2 -   6.3s
[CV] max_features=auto, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=auto, max_depth=5, min_samples_leaf=5 -   6.5s
[CV] max_features=auto, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=auto, max_depth=5, min_samples_leaf=5 -   6.4s
[CV] max_features=auto, max_depth=5, min_samples_leaf=5 ..............
[CV] ..... max_features=auto, max_depth=5, min_samples_leaf=5 -   6.3s
[CV] max_features=auto, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=auto, max_depth=5, min_samples_leaf=10 -   6.5s
[CV] max_features=auto, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=auto, max_depth=5, min_samples_leaf=10 -   6.4s
[CV] max_features=auto, max_depth=5, min_samples_leaf=10 .............
[CV] .... max_features=auto, max_depth=5, min_samples_leaf=10 -   6.3s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=2 -   4.2s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=2 -   4.1s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=2 -   4.2s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=5 -   4.3s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=5 -   4.1s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=5 -   4.3s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=sqrt, max_depth=8, min_samples_leaf=10 -   4.0s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=sqrt, max_depth=8, min_samples_leaf=10 -   4.1s
[CV] max_features=sqrt, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=sqrt, max_depth=8, min_samples_leaf=10 -   4.0s
[CV] max_features=log2, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=log2, max_depth=8, min_samples_leaf=2 -   4.1s
[CV] max_features=log2, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=log2, max_depth=8, min_samples_leaf=2 -   4.3s
[CV] max_features=log2, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=log2, max_depth=8, min_samples_leaf=2 -   4.2s
[CV] max_features=log2, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=log2, max_depth=8, min_samples_leaf=5 -   4.2s
[CV] max_features=log2, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=log2, max_depth=8, min_samples_leaf=5 -   4.1s
[CV] max_features=log2, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=log2, max_depth=8, min_samples_leaf=5 -   4.1s
[CV] max_features=log2, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=log2, max_depth=8, min_samples_leaf=10 -   4.0s
[CV] max_features=log2, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=log2, max_depth=8, min_samples_leaf=10 -   4.0s
[CV] max_features=log2, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=log2, max_depth=8, min_samples_leaf=10 -   4.2s
[CV] max_features=auto, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=auto, max_depth=8, min_samples_leaf=2 -   9.6s
[CV] max_features=auto, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=auto, max_depth=8, min_samples_leaf=2 -   9.6s
[CV] max_features=auto, max_depth=8, min_samples_leaf=2 ..............
[CV] ..... max_features=auto, max_depth=8, min_samples_leaf=2 -   9.9s
[CV] max_features=auto, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=auto, max_depth=8, min_samples_leaf=5 -   9.7s
[CV] max_features=auto, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=auto, max_depth=8, min_samples_leaf=5 -   9.7s
[CV] max_features=auto, max_depth=8, min_samples_leaf=5 ..............
[CV] ..... max_features=auto, max_depth=8, min_samples_leaf=5 -   9.1s
[CV] max_features=auto, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=auto, max_depth=8, min_samples_leaf=10 -   8.7s
[CV] max_features=auto, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=auto, max_depth=8, min_samples_leaf=10 -   8.5s
[CV] max_features=auto, max_depth=8, min_samples_leaf=10 .............
[CV] .... max_features=auto, max_depth=8, min_samples_leaf=10 -   8.6s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=sqrt, max_depth=12, min_samples_leaf=2 -   5.4s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=sqrt, max_depth=12, min_samples_leaf=2 -   5.3s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=sqrt, max_depth=12, min_samples_leaf=2 -   5.4s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=sqrt, max_depth=12, min_samples_leaf=5 -   4.7s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=sqrt, max_depth=12, min_samples_leaf=5 -   4.8s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=sqrt, max_depth=12, min_samples_leaf=5 -   4.7s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=sqrt, max_depth=12, min_samples_leaf=10 -   4.3s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=sqrt, max_depth=12, min_samples_leaf=10 -   4.3s
[CV] max_features=sqrt, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=sqrt, max_depth=12, min_samples_leaf=10 -   4.3s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=log2, max_depth=12, min_samples_leaf=2 -   5.4s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=log2, max_depth=12, min_samples_leaf=2 -   5.3s
[CV] max_features=log2, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=log2, max_depth=12, min_samples_leaf=2 -   5.5s
[CV] max_features=log2, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=log2, max_depth=12, min_samples_leaf=5 -   4.7s
[CV] max_features=log2, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=log2, max_depth=12, min_samples_leaf=5 -   4.9s
[CV] max_features=log2, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=log2, max_depth=12, min_samples_leaf=5 -   4.7s
[CV] max_features=log2, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=log2, max_depth=12, min_samples_leaf=10 -   4.4s
[CV] max_features=log2, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=log2, max_depth=12, min_samples_leaf=10 -   4.5s
[CV] max_features=log2, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=log2, max_depth=12, min_samples_leaf=10 -   4.4s
[CV] max_features=auto, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=auto, max_depth=12, min_samples_leaf=2 -  14.4s
[CV] max_features=auto, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=auto, max_depth=12, min_samples_leaf=2 -  14.9s
[CV] max_features=auto, max_depth=12, min_samples_leaf=2 .............
[CV] .... max_features=auto, max_depth=12, min_samples_leaf=2 -  14.4s
[CV] max_features=auto, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=auto, max_depth=12, min_samples_leaf=5 -  13.2s
[CV] max_features=auto, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=auto, max_depth=12, min_samples_leaf=5 -  13.0s
[CV] max_features=auto, max_depth=12, min_samples_leaf=5 .............
[CV] .... max_features=auto, max_depth=12, min_samples_leaf=5 -  12.7s
[CV] max_features=auto, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=auto, max_depth=12, min_samples_leaf=10 -  11.1s
[CV] max_features=auto, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=auto, max_depth=12, min_samples_leaf=10 -  11.2s
[CV] max_features=auto, max_depth=12, min_samples_leaf=10 ............
[CV] ... max_features=auto, max_depth=12, min_samples_leaf=10 -  10.8s
[Parallel(n_jobs=1)]: Done  40 tasks       | elapsed:  2.9min
[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:  8.1min finished

In [26]:
print rf.best_estimator_
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
In [40]:
rf_opt = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
rf_opt.fit(X_train, y_train)
Out[40]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
In [41]:
preds = rf_opt.predict(X_test)
In [42]:
plt.scatter(preds, y_test)
plt.title("Random Forest Model Prediction")
plt.ylabel("Actual Rental Counts")
plt.xlabel("Predicted Rental Counts")
plt.xlim(0, 1000)
plt.ylim(0, 1000)
Out[42]:
(0, 1000)
In [43]:
#Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(rf_opt.predict(X_train), y_train)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(rf_opt.predict(X_test), y_test)
print "Training accuracy: %0.2f%%" % (100*rf_opt.score(X_train, y_train))
print "Test accuracy: %0.2f%%" % (100*rf_opt.score(X_test, y_test)) + "\n"
Root Mean Squared Logarithmic Error Train:  0.21923700022
Root Mean Squared Logarithmic Error Test:  0.325501592158
Training accuracy: 97.63%
Test accuracy: 94.65%

In [44]:
numfeat = len(features)
indices = np.argsort(rf_opt.feature_importances_)[::-1][:numfeat]

plt.bar(xrange(numfeat), rf_opt.feature_importances_[indices], align='center', alpha=.5)
plt.xticks(xrange(numfeat), features[indices], rotation='vertical', fontsize=12)
plt.xlim([-1, numfeat])
plt.ylabel('Feature %', fontsize=12)
plt.title('Feature importance computed by Random Forest', fontsize=16)
Out[44]:
<matplotlib.text.Text at 0x371a4080>
In [4]:
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
In [5]:
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = ensemble.GradientBoostingRegressor(**params)
In [12]:
clf.fit(X_train, y_train)
Out[12]:
GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.01, loss='ls',
             max_depth=4, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=1,
             min_weight_fraction_leaf=0.0, n_estimators=500,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)
In [15]:
preds = clf.predict(X_test)
plt.scatter(preds, y_test)
plt.title("Gradient Boosting Model Prediction")
plt.ylabel("Actual Rental Counts")
plt.xlabel("Predicted Rental Counts")
plt.xlim(0, 1000)
plt.ylim(0, 1000)
Out[15]:
(0, 1000)
In [16]:
#Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(clf.predict(X_train), y_train)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(clf.predict(X_test), y_test)
print "Training accuracy: %0.2f%%" % (100*clf.score(X_train, y_train))
print "Test accuracy: %0.2f%%" % (100*clf.score(X_test, y_test)) + "\n"
Root Mean Squared Logarithmic Error Train:  0.565749310279
Root Mean Squared Logarithmic Error Test:  0.575569763951
Training accuracy: 88.48%
Test accuracy: 88.94%

In [ ]: